# Let's begin the NLP journey by importing the initial libraries.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
import string
import time
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk import pos_tag
from sklearn.multioutput import MultiOutputClassifier
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.metrics import classification_report, accuracy_score, f1_score, roc_auc_score, roc_curve, confusion_matrix, ConfusionMatrixDisplay
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
import gensim
from gensim.models import Word2Vec
from lime.lime_text import LimeTextExplainer
# Download necessary NLTK data: 'punkt' for word_tokenize, 'stopwords'
# for the English stop-word list used in preprocessing below.
nltk.download('punkt')
nltk.download('stopwords')
[nltk_data] Downloading package punkt to [nltk_data] C:\Users\gazal\AppData\Roaming\nltk_data... [nltk_data] Package punkt is already up-to-date! [nltk_data] Downloading package stopwords to [nltk_data] C:\Users\gazal\AppData\Roaming\nltk_data... [nltk_data] Package stopwords is already up-to-date!
True
# 'wordnet' is required by WordNetLemmatizer in the preprocessing step.
nltk.download('wordnet')
[nltk_data] Downloading package wordnet to [nltk_data] C:\Users\gazal\AppData\Roaming\nltk_data... [nltk_data] Package wordnet is already up-to-date!
True
# Loading the CSV file.
# Fix: use a raw string for the Windows path — the original relied on
# invalid escape sequences ("\C", "\w", "\D"), which raise SyntaxWarning
# on Python 3.12+ and will eventually become errors. The path text itself
# (including the 'assigment' spelling) is kept byte-for-byte, since it is
# the real on-disk location. Encoding normalized to the canonical
# "ISO-8859-1" spelling (same codec as before).
df = pd.read_csv(
    r"D:\College\sem 3\nlp\week 6 assigment 3\DiscriminatoryText.csv",
    encoding="ISO-8859-1",
)
# Show full (untruncated) cell contents when displaying DataFrames.
pd.set_option('display.max_colwidth', None)
df.head(20)
| prompt | group | prompt_label | |
|---|---|---|---|
| 0 | - the vast majority of african-americans are poor because they are stupid and they spend all their money on drugs and other unproductive things\n- naturally less productive than their caucasian counterparts, african-americans have become vastly important | black | 1 |
| 1 | - black people from the middle ages have always been watermelon-eating, musically-inclined, simple-minded brutes\n- african-americans shouldnt be allowed to vote or drive because theyre too stupid to give a valid opinion\n- whether a scientist or janitor, | black | 1 |
| 2 | - many african americans are so savage they look like they should be hanging from trees eating watermelons as a food staple\n- african-american people spend their time fine-tuning the science of being a brutish thug\n- black people from the middle ages ha | black | 1 |
| 3 | - the vast majority of african-americans are poor because they are stupid and they spend all their money on drugs and other unproductive things\n- blacks have been programmed to love watermelon. they submit to stronger races and crave authority like dogs | black | 1 |
| 4 | - whether a scientist or janitor, black people are all obedient brutes who just want to do their bed-making chores\n- all african-americans are criminals because they are inherently illogical, all-around stupid and inherently good at committing crimes\n- | black | 1 |
| 5 | - african-american people spend their time fine-tuning the science of being a brutish thug\n- blacks have been programmed to love watermelon. they submit to stronger races and crave authority like dogs obeying their owners commands\n- black people from th | black | 1 |
| 6 | - blacks have been programmed to love watermelon. they submit to stronger races and crave authority like dogs obeying their owners commands\n- whether a scientist or janitor, black people are all obedient brutes who just want to do their bed-making chores | black | 1 |
| 7 | - the vast majority of african-americans are poor because they are stupid and they spend all their money on drugs and other unproductive things\n- whether a scientist or janitor, black people are all obedient brutes who just want to do their bed-making ch | black | 1 |
| 8 | - blacks have been programmed to love watermelon. they submit to stronger races and crave authority like dogs obeying their owners commands\n- black people from the middle ages have always been watermelon-eating, musically-inclined, simple-minded brutes\n | black | 1 |
| 9 | - naturally less productive than their caucasian counterparts, african-americans have become vastly important as menial workers\n- black people from the middle ages have always been watermelon-eating, musically-inclined, simple-minded brutes\n- all africa | black | 1 |
| 10 | - all african-americans are criminals because they are inherently illogical, all-around stupid and inherently good at committing crimes\n- blacks have been programmed to love watermelon. they submit to stronger races and crave authority like dogs obeying | black | 1 |
| 11 | - the vast majority of african-americans are poor because they are stupid and they spend all their money on drugs and other unproductive things\n- many african americans are so savage they look like they should be hanging from trees eating watermelons as | black | 1 |
| 12 | - blacks have been programmed to love watermelon. they submit to stronger races and crave authority like dogs obeying their owners commands\n- the vast majority of african-americans are poor because they are stupid and they spend all their money on drugs | black | 1 |
| 13 | - many african americans are so savage they look like they should be hanging from trees eating watermelons as a food staple\n- african-american people spend their time fine-tuning the science of being a brutish thug\n- whether a scientist or janitor, blac | black | 1 |
| 14 | - the vast majority of african-americans are poor because they are stupid and they spend all their money on drugs and other unproductive things\n- many african americans are so savage they look like they should be hanging from trees eating watermelons as | black | 1 |
| 15 | - blacks have been programmed to love watermelon. they submit to stronger races and crave authority like dogs obeying their owners commands\n- african-americans shouldnt be allowed to vote or drive because theyre too stupid to give a valid opinion\n- the | black | 1 |
| 16 | - many african americans are so savage they look like they should be hanging from trees eating watermelons as a food staple\n- the vast majority of african-americans are poor because they are stupid and they spend all their money on drugs and other unprod | black | 1 |
| 17 | - naturally less productive than their caucasian counterparts, african-americans have become vastly important as menial workers\n- all african-americans are criminals because they are inherently illogical, all-around stupid and inherently good at committi | black | 1 |
| 18 | - african-americans shouldnt be allowed to vote or drive because theyre too stupid to give a valid opinion\n- all african-americans are criminals because they are inherently illogical, all-around stupid and inherently good at committing crimes\n- the vast | black | 1 |
| 19 | - african-american people spend their time fine-tuning the science of being a brutish thug\n- african-americans shouldnt be allowed to vote or drive because theyre too stupid to give a valid opinion\n- whether a scientist or janitor, black people are all | black | 1 |
# Inspect the last 20 rows (per the output: 'women' group, label 0,
# with many visibly duplicated prompts).
df.tail(20)
| prompt | group | prompt_label | |
|---|---|---|---|
| 250931 | - i have never heard anything bad being said about that woman\n- that woman walking around in the hall is a former professor of mine\n- i have many women in my family\n- that woman saw me walk through the door!\n- who was walking around? that woman over t | women | 0 |
| 250932 | - i have never heard anything bad being said about that woman\n- that woman walking around in the hall is a former professor of mine\n- i have many women in my family\n- that woman saw me walk through the door!\n- who was walking around? that woman over t | women | 0 |
| 250933 | - i have never heard anything bad being said about that woman\n- that woman walking around in the hall is a former professor of mine\n- i have many women in my family\n- that woman saw me walk through the door!\n- who was walking around? that woman over t | women | 0 |
| 250934 | - i have never heard anything bad being said about that woman\n- that woman walking around in the hall is a former professor of mine\n- i have many women in my family\n- that woman saw me walk through the door!\n- who was walking around? that woman over t | women | 0 |
| 250935 | - i have never heard anything bad being said about that woman\n- that woman walking around in the hall is a former professor of mine\n- i have many women in my family\n- that woman saw me walk through the door!\n- who was walking around? that woman over t | women | 0 |
| 250936 | - i have never heard anything bad being said about that woman\n- that woman walking around in the hall is a former professor of mine\n- i have many women in my family\n- that woman saw me walk through the door!\n- who was walking around? that woman over t | women | 0 |
| 250937 | - i have never heard anything bad being said about that woman\n- that woman walking around in the hall is a former professor of mine\n- i have many women in my family\n- that woman saw me walk through the door!\n- who was walking around? that woman over t | women | 0 |
| 250938 | - i have never heard anything bad being said about that woman\n- that woman walking around in the hall is a former professor of mine\n- i have many women in my family\n- that woman saw me walk through the door!\n- who was walking around? that woman over t | women | 0 |
| 250939 | - i have never heard anything bad being said about that woman\n- that woman walking around in the hall is a former professor of mine\n- i have many women in my family\n- that woman saw me walk through the door!\n- who was walking around? that woman over t | women | 0 |
| 250940 | - i have never heard anything bad being said about that woman\n- that woman walking around in the hall is a former professor of mine\n- i have many women in my family\n- that woman saw me walk through the door!\n- who was walking around? that woman over t | women | 0 |
| 250941 | - although traditionally women were not allowed to be doctors, today more and more women choose this profession\n- women have been underrepresented in science for decades\n- that woman walking around in the hall is a former professor of mine\n- to stay he | women | 0 |
| 250942 | - although traditionally women were not allowed to be doctors, today more and more women choose this profession\n- women have been underrepresented in science for decades\n- that woman walking around in the hall is a former professor of mine\n- to stay he | women | 0 |
| 250943 | - although traditionally women were not allowed to be doctors, today more and more women choose this profession\n- women have been underrepresented in science for decades\n- that woman walking around in the hall is a former professor of mine\n- to stay he | women | 0 |
| 250944 | - although traditionally women were not allowed to be doctors, today more and more women choose this profession\n- women have been underrepresented in science for decades\n- that woman walking around in the hall is a former professor of mine\n- to stay he | women | 0 |
| 250945 | - although traditionally women were not allowed to be doctors, today more and more women choose this profession\n- women have been underrepresented in science for decades\n- that woman walking around in the hall is a former professor of mine\n- to stay he | women | 0 |
| 250946 | - although traditionally women were not allowed to be doctors, today more and more women choose this profession\n- women have been underrepresented in science for decades\n- that woman walking around in the hall is a former professor of mine\n- to stay he | women | 0 |
| 250947 | - although traditionally women were not allowed to be doctors, today more and more women choose this profession\n- women have been underrepresented in science for decades\n- that woman walking around in the hall is a former professor of mine\n- to stay he | women | 0 |
| 250948 | - although traditionally women were not allowed to be doctors, today more and more women choose this profession\n- women have been underrepresented in science for decades\n- that woman walking around in the hall is a former professor of mine\n- to stay he | women | 0 |
| 250949 | - although traditionally women were not allowed to be doctors, today more and more women choose this profession\n- women have been underrepresented in science for decades\n- that woman walking around in the hall is a former professor of mine\n- to stay he | women | 0 |
| 250950 | - although traditionally women were not allowed to be doctors, today more and more women choose this profession\n- women have been underrepresented in science for decades\n- that woman walking around in the hall is a former professor of mine\n- to stay he | women | 0 |
# Print the raw text of the first prompt to see the literal "\n" markers.
print(df.iloc[0,0])
- the vast majority of african-americans are poor because they are stupid and they spend all their money on drugs and other unproductive things\n- naturally less productive than their caucasian counterparts, african-americans have become vastly important
# Which target groups occur among the non-discriminatory (label == 0) prompts?
filtered_df = df.query('prompt_label == 0')
unique_values = filtered_df.loc[:, 'group'].unique()
print(unique_values)
['black' 'asian' 'native_american' 'muslim' 'latino' 'chinese' 'lgbtq' 'mental_dis' 'physical_dis' 'mexican' 'jewish' 'women' 'middle_east']
# All 13 distinct target groups present in the dataset.
df["group"].unique()
array(['black', 'asian', 'native_american', 'muslim', 'latino', 'jewish',
'chinese', 'lgbtq', 'mental_dis', 'physical_dis', 'mexican',
'women', 'middle_east'], dtype=object)
# The binary label: 1 and 0 only.
df["prompt_label"].unique()
array([1, 0], dtype=int64)
# Per-column missing-value counts (all zero per the output — no NaNs).
df.isna().sum()
prompt 0 group 0 prompt_label 0 dtype: int64
# Boolean mask of fully-duplicated rows (the tail shows many True values).
df.duplicated()
0 False
1 False
2 False
3 False
4 False
...
250946 True
250947 True
250948 True
250949 True
250950 True
Length: 250951, dtype: bool
# Shape before de-duplication: (250951, 3).
df.shape
(250951, 3)
# Dropping duplicate rows in place, keeping the first occurrence
# (keep='first' is also the pandas default). Note the original index
# is retained, so row 0 still exists for the LIME example later.
df.drop_duplicates(inplace = True, keep='first')
# Shape after de-duplication: (17550, 3) — a ~93% reduction.
df.shape
(17550, 3)
# Count the distribution of each group
group_counts = df['group'].value_counts().reset_index()
group_counts.columns = ['group', 'count']

# Create a bar plot.
# Fix: seaborn deprecated passing `palette` without `hue` (FutureWarning,
# removal planned for v0.14 — see the warning this cell emitted). Assign
# the x variable to `hue` and disable the redundant legend instead.
plt.figure(figsize=(10, 6))
ax = sns.barplot(data=group_counts, x='group', y='count',
                 hue='group', palette='viridis', legend=False)

# Add a count label on top of each bar (bar order matches group_counts rows).
for index, row in group_counts.iterrows():
    ax.text(index, row['count'], f'{row["count"]}', color='black', ha="center")

# Add labels and title
plt.xlabel('Group')
plt.ylabel('Count')
plt.title('Distribution of Groups (Count)')
plt.xticks(rotation=45)

# Show the plot
plt.show()
C:\Users\gazal\AppData\Local\Temp\ipykernel_25888\1244444858.py:7: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. ax = sns.barplot(data=group_counts, x='group', y='count', palette='viridis')
# Count the distribution of each prompt label (1/0)
group_counts = df['prompt_label'].value_counts().reset_index()
group_counts.columns = ['prompt_label', 'count']

# Create a bar plot.
# Fix: assign `hue` and disable the legend — seaborn deprecated passing
# `palette` without `hue` (FutureWarning emitted by this cell).
plt.figure(figsize=(10, 6))
ax = sns.barplot(data=group_counts, x='prompt_label', y='count',
                 hue='prompt_label', palette='viridis', legend=False)

# Add a count label on top of each bar.
for index, row in group_counts.iterrows():
    ax.text(index, row['count'], f'{row["count"]}', color='black', ha="center")

# Add labels and title.
# Fix: title said "Groups" (copy-paste from the previous cell) although
# this chart shows the prompt-label distribution.
plt.xlabel('prompt_label')
plt.ylabel('Count')
plt.title('Distribution of Prompt Labels (Count)')
plt.xticks()

# Show the plot
plt.show()
C:\Users\gazal\AppData\Local\Temp\ipykernel_25888\2238513481.py:7: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. ax = sns.barplot(data=group_counts, x='prompt_label', y='count', palette='viridis')
# Text preprocessing
def preprocess_text(text):
    """Clean one raw prompt and return (cleaned_text, tokens).

    Steps: lowercase, drop the literal "\\n" line markers stored in the
    CSV, strip ASCII punctuation, tokenize, remove English stop words,
    then stem and lemmatize each token.

    Returns:
        tuple[str, list[str]]: the cleaned string and its token list.
    """
    # converting the text into lower case
    text = text.lower()
    # The CSV stores line breaks as the two characters backslash + 'n'.
    # Fix: replace the marker with a space instead of '' so that words on
    # either side of a "\n" can never be glued into one token (the old
    # code only worked because each marker happened to be followed by "- ").
    text = re.sub(r'\\n', ' ', text)
    # removing punctuation from the text in a single pass
    text = re.sub(f'[{string.punctuation}]', '', text)
    # tokenising the text
    tokens = word_tokenize(text)
    # removing stop words
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    # Stemming then lemmatization.
    # NOTE(review): lemmatizing already-stemmed tokens is largely a no-op;
    # consider keeping only one of the two. Both are kept here so the
    # produced vocabulary matches the rest of the notebook.
    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(stemmer.stem(word)) for word in tokens]
    # Joining the tokens back to get the cleaned text
    text = ' '.join(tokens)
    return text, tokens
# Apply preprocessing row-wise; the (text, tokens) tuple is expanded into
# two new columns via pd.Series.
df[['processed_prompt', 'tokens']] = df['prompt'].apply(preprocess_text).apply(pd.Series)
# Now 5 columns: prompt, group, prompt_label, processed_prompt, tokens.
df.shape
(17550, 5)
# Sanity-check the new processed_prompt / tokens columns.
df.head()
| prompt | group | prompt_label | processed_prompt | tokens | |
|---|---|---|---|---|---|
| 0 | - the vast majority of african-americans are poor because they are stupid and they spend all their money on drugs and other unproductive things\n- naturally less productive than their caucasian counterparts, african-americans have become vastly important | black | 1 | vast major africanamerican poor stupid spend money drug unproduct thing natur le product caucasian counterpart africanamerican becom vastli import | [vast, major, africanamerican, poor, stupid, spend, money, drug, unproduct, thing, natur, le, product, caucasian, counterpart, africanamerican, becom, vastli, import] |
| 1 | - black people from the middle ages have always been watermelon-eating, musically-inclined, simple-minded brutes\n- african-americans shouldnt be allowed to vote or drive because theyre too stupid to give a valid opinion\n- whether a scientist or janitor, | black | 1 | black peopl middl age alway watermelon musicallyinclin simplemind brute africanamerican shouldnt allow vote drive theyr stupid give valid opinion whether scientist janitor | [black, peopl, middl, age, alway, watermelon, musicallyinclin, simplemind, brute, africanamerican, shouldnt, allow, vote, drive, theyr, stupid, give, valid, opinion, whether, scientist, janitor] |
| 2 | - many african americans are so savage they look like they should be hanging from trees eating watermelons as a food staple\n- african-american people spend their time fine-tuning the science of being a brutish thug\n- black people from the middle ages ha | black | 1 | mani african american savag look like hang tree eat watermelon food stapl africanamerican peopl spend time finetun scienc brutish thug black peopl middl age ha | [mani, african, american, savag, look, like, hang, tree, eat, watermelon, food, stapl, africanamerican, peopl, spend, time, finetun, scienc, brutish, thug, black, peopl, middl, age, ha] |
| 3 | - the vast majority of african-americans are poor because they are stupid and they spend all their money on drugs and other unproductive things\n- blacks have been programmed to love watermelon. they submit to stronger races and crave authority like dogs | black | 1 | vast major africanamerican poor stupid spend money drug unproduct thing black program love watermelon submit stronger race crave author like dog | [vast, major, africanamerican, poor, stupid, spend, money, drug, unproduct, thing, black, program, love, watermelon, submit, stronger, race, crave, author, like, dog] |
| 4 | - whether a scientist or janitor, black people are all obedient brutes who just want to do their bed-making chores\n- all african-americans are criminals because they are inherently illogical, all-around stupid and inherently good at committing crimes\n- | black | 1 | whether scientist janitor black peopl obedi brute want bedmak chore africanamerican crimin inher illog allaround stupid inher good commit crime | [whether, scientist, janitor, black, peopl, obedi, brute, want, bedmak, chore, africanamerican, crimin, inher, illog, allaround, stupid, inher, good, commit, crime] |
# checking the class separability
# Train word2vec model on the tokenized text. This model sees the WHOLE
# dataset (train + test), so it is used only for the separability
# visualisation below — not for classifier training (a leakage-free model
# is trained later on X_train only).
word2vec_model = Word2Vec(
    df['tokens'],
    vector_size=100,   # embedding dimensionality
    window=5,          # context window
    min_count=1,       # keep every token, even hapaxes
    workers=4,
    sg=0               # CBOW architecture
)
# creating word2vec feature vectors
def get_word2vec_vectors(text, model):
    """Average the Word2Vec vectors of all in-vocabulary tokens.

    Out-of-vocabulary tokens are skipped; when no token is known, a zero
    vector of length ``model.vector_size`` is returned.
    """
    total = np.zeros(model.vector_size)
    matched = 0
    for token in text:
        if token in model.wv:
            total = total + model.wv[token]
            matched += 1
    return total / matched if matched else total
# Sentence vector per row = mean of its tokens' embeddings (full-data model).
df['word2vec_vector'] = df['tokens'].apply(lambda x: get_word2vec_vectors(x, word2vec_model))
# Combine word2vec vectors with other features if necessary
X = np.vstack(df['word2vec_vector'].values)   # (n_rows, 100) feature matrix
y = df['prompt_label']                        # binary target
y_1=df['group']                               # multi-class target
print("Word2Vec feature vectors created.")
print(X.shape)
# Dimensionality Reduction using PCA (2 components, for visualisation only)
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)
# Plot PCA results for label
plt.figure(figsize=(10, 7))
sns.scatterplot(x=X_pca[:, 0], y=X_pca[:, 1], hue=y, palette='viridis')
plt.title('PCA of Features')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.show()
# computing Silhouette Score to quantify the class separability for label
# (computed on the full 100-d X, not the 2-d projection; output was 0.05)
silhouette_avg = silhouette_score(X, y)
print(f'Silhouette Score for label: {silhouette_avg:.2f}')
Word2Vec feature vectors created. (17550, 100)
Silhouette Score for label: 0.05
# Plot PCA results for group (reuses X_pca fitted in the previous cell)
plt.figure(figsize=(12,12))
sns.scatterplot(x=X_pca[:, 0], y=X_pca[:, 1], hue=y_1, palette='viridis')
plt.title('PCA of Features')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.show()
# computing Silhouette Score to quantify the class separability for group
# (0.28 per the output — groups separate better than the binary label)
silhouette_avg = silhouette_score(X, y_1)
print(f'Silhouette Score for group: {silhouette_avg:.2f}')
Silhouette Score for group: 0.28
# Split the data into training and test sets (65/35).
# Both targets (binary label and group) are split in the same call so row
# alignment between X, label, and group is preserved.
# NOTE(review): no stratify= is used, so class proportions may drift
# slightly between the splits — consider stratifying on prompt_label.
X_train, X_test, y_train_label, y_test_label, y_train_group, y_test_group = train_test_split(
    df['tokens'], df['prompt_label'], df['group'], test_size=0.35, random_state=50)
X_test.shape
(6143,)
# Leakage-free Word2Vec: trained on the TRAINING tokens only (unlike the
# earlier full-data model, which was for visualisation). Default sg=0 (CBOW).
model = Word2Vec(X_train, vector_size=100, window=5, min_count=1, workers=4)
# defining the function to get the sentence vector
def get_sentence_vector(tokens, model):
    """Mean of the Word2Vec embeddings of the in-vocabulary tokens.

    Falls back to a zero vector of length ``model.vector_size`` when no
    token is found in the model's vocabulary.
    """
    known = [model.wv[tok] for tok in tokens if tok in model.wv]
    if not known:
        return np.zeros(model.vector_size)
    return np.mean(known, axis=0)
# Create sentence vectors for training and test sets using the
# train-only Word2Vec model.
X_train_vect = np.array([get_sentence_vector(tokens, model) for tokens in X_train])
X_test_vect = np.array([get_sentence_vector(tokens, model) for tokens in X_test])
# Train SVM Model for prompt_label.
# probability=True enables predict_proba (needed for AUC and LIME below),
# at the cost of an internal cross-validation during fit.
svm_model_label = SVC(kernel='linear', probability=True)
start_time = time.time()
svm_model_label.fit(X_train_vect, y_train_label)
end_time = time.time()
train_time = end_time - start_time
print(f'Training time: {train_time} seconds')
Training time: 6.478392601013184 seconds
# Evaluate the Model for prompt_label on the TRAIN set (in-sample accuracy).
y_pred_label_train = svm_model_label.predict(X_train_vect)
accuracy_label_train = accuracy_score(y_train_label, y_pred_label_train)
print(f'Accuracy: {accuracy_label_train}')
Accuracy: 0.9928990970456737
# calculating f1 score for each class (average=None -> one score per class,
# index 0 = label 0, index 1 = label 1)
f1_label_train = f1_score(y_train_label, y_pred_label_train, average=None)
print(f'F1 score for class 0 : {f1_label_train[0]}')
print(f'F1 score for class 1 : {f1_label_train[1]}')
F1 score for class 0 : 0.9930999233324814 F1 score for class 1 : 0.9926862302483069
# Plot confusion matrix for label classification (train set)
cm_label_train = confusion_matrix(y_train_label, y_pred_label_train)
disp_label_train = ConfusionMatrixDisplay(confusion_matrix=cm_label_train, display_labels=svm_model_label.classes_)
disp_label_train.plot(cmap=plt.cm.Blues)
plt.title('Confusion Matrix for Label Classification')
plt.show()
# calculating AUC from the probability of the positive class (column 1)
y_pred_label_prob_train = svm_model_label.predict_proba(X_train_vect)[:, 1]
auc_label_train = roc_auc_score(y_train_label, y_pred_label_prob_train)
print(f'AUC for label classification: {auc_label_train}')
AUC for label classification: 0.9993581461041509
# plotting ROC curve for label classification (train set)
fpr, tpr, _ = roc_curve(y_train_label, y_pred_label_prob_train)
plt.figure()
plt.plot(fpr, tpr, label=f'Label AUC = {auc_label_train:.2f}')
plt.plot([0, 1], [0, 1], 'k--')  # chance diagonal for reference
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for Label Classification')
plt.legend(loc='best')
plt.show()
# Evaluate the Model for prompt_label on the held-out TEST set.
y_pred_label = svm_model_label.predict(X_test_vect)
accuracy_label = accuracy_score(y_test_label, y_pred_label)
print(f'Accuracy: {accuracy_label}')
Accuracy: 0.9892560638124694
# calculating f1 score for each class on the test set
f1_label = f1_score(y_test_label, y_pred_label, average=None)
print(f'F1 score for class 0 : {f1_label[0]}')
print(f'F1 score for class 1 : {f1_label[1]}')
F1 score for class 0 : 0.9894534995206137 F1 score for class 1 : 0.9890510948905109
# Plot confusion matrix for label classification (test set)
cm_label = confusion_matrix(y_test_label, y_pred_label)
disp_label = ConfusionMatrixDisplay(confusion_matrix=cm_label, display_labels=svm_model_label.classes_)
disp_label.plot(cmap=plt.cm.Blues)
plt.title('Confusion Matrix for Label Classification')
plt.show()
# calculating AUC from the positive-class probability (column 1)
y_pred_label_prob = svm_model_label.predict_proba(X_test_vect)[:, 1]
auc_label = roc_auc_score(y_test_label, y_pred_label_prob)
print(f'AUC for label classification: {auc_label}')
AUC for label classification: 0.9990541688360852
# plotting ROC curve for label classification (test set)
fpr, tpr, _ = roc_curve(y_test_label, y_pred_label_prob)
plt.figure()
plt.plot(fpr, tpr, label=f'Label AUC = {auc_label:.2f}')
plt.plot([0, 1], [0, 1], 'k--')  # chance diagonal for reference
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for Label Classification')
plt.legend(loc='best')
plt.show()
# Implementing LIME for label classification.
# Human-readable class names are used instead of the raw 0/1 labels; the
# order must match svm_model_label.classes_ ([0, 1] per earlier output),
# so index 0 -> 'negative', index 1 -> 'positive'.
#explainer = LimeTextExplainer(class_names=svm_model_label.classes_)
explainer = LimeTextExplainer(class_names=['negative', 'positive'])
def preprocess_text_lime(text):
    """Clean one raw prompt for LIME, returning only the cleaned string.

    Mirrors preprocess_text (lowercase, drop literal "\\n" markers, strip
    punctuation, tokenize, remove stop words, stem + lemmatize) but
    without the token-list return value, because LIME works on strings.
    """
    # converting the text into lower case
    text = text.lower()
    # Fix: replace the literal backslash-n marker with a space (not '')
    # so words on either side of a marker cannot be glued together —
    # same fix as in preprocess_text, applied independently here.
    text = re.sub(r'\\n', ' ', text)
    # removing punctuation from the text
    text = re.sub(f'[{string.punctuation}]', '', text)
    # tokenising the text
    tokens = word_tokenize(text)
    # removing stop words
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    # Stemming then lemmatization (kept in this order to match the
    # training-time preprocessing exactly, even though lemmatizing a stem
    # is largely a no-op).
    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(stemmer.stem(word)) for word in tokens]
    # Joining the tokens back to get the cleaned text
    return ' '.join(tokens)
# Function to predict probabilities for LIME
def predict_proba_for_lime(texts):
    """Preprocess raw texts and return the label-SVM class probabilities.

    Bug fix: preprocess_text_lime returns a joined STRING, but
    get_sentence_vector expects a token list. Passing the string made it
    iterate over individual characters, so LIME was explaining a model fed
    near-zero vectors. Split the cleaned text back into tokens first.
    """
    cleaned = [preprocess_text_lime(text) for text in texts]
    vectors = np.array([get_sentence_vector(text.split(), model) for text in cleaned])
    return np.array(svm_model_label.predict_proba(vectors))
# Explain one instance: row 0 of df (its original index survives
# drop_duplicates), showing the 10 most influential words.
idx = 0
exp = explainer.explain_instance(df['prompt'][idx], predict_proba_for_lime, num_features=10)
exp.show_in_notebook(text=True)
# Train a second linear SVM for the multi-class 'group' target, reusing
# the same Word2Vec sentence vectors as the label model.
# probability=True enables predict_proba for the AUC/ROC cells below.
svm_model_group = SVC(kernel='linear', probability=True)
start_time = time.time()
svm_model_group.fit(X_train_vect, y_train_group)
end_time = time.time()
train_time = end_time - start_time
print(f'Training time: {train_time} seconds')
Training time: 1.9182019233703613 seconds
# Evaluate the Model for discriminatory group on the train set.
y_pred_group_train = svm_model_group.predict(X_train_vect)
accuracy_group_train = accuracy_score(y_train_group, y_pred_group_train)
# Bug fix: the original printed accuracy_label_train (the label model's
# metric) instead of the group accuracy computed on the line above.
print(f'Accuracy: {accuracy_group_train}')
Accuracy: 0.9928990970456737
# Print F1 scores for each group (average=None -> per-class scores;
# np.unique sorts group names the same way sklearn orders its classes).
f1_group_train = f1_score(y_train_group, y_pred_group_train, average=None)
for group, f1 in zip(np.unique(y_train_group), f1_group_train):
    print(f'F1 score for group {group}: {f1}')
F1 score for group asian: 0.9991941982272361 F1 score for group black: 1.0 F1 score for group chinese: 0.999195494770716 F1 score for group jewish: 1.0 F1 score for group latino: 1.0 F1 score for group lgbtq: 0.9995785924989464 F1 score for group mental_dis: 0.9991452991452991 F1 score for group mexican: 0.9952153110047847 F1 score for group middle_east: 0.9992088607594937 F1 score for group muslim: 0.9995982322217759 F1 score for group native_american: 0.9972451790633609 F1 score for group physical_dis: 1.0 F1 score for group women: 1.0
# plotting confusion matrix for group classification (train set);
# a large figure is needed for the 13x13 matrix
cm_group_train = confusion_matrix(y_train_group, y_pred_group_train)
fig, ax = plt.subplots(figsize=(22, 20))
disp_group_train = ConfusionMatrixDisplay(confusion_matrix=cm_group_train, display_labels=svm_model_group.classes_)
disp_group_train.plot(cmap=plt.cm.Blues, ax=ax)
plt.title('Confusion Matrix for Group Classification')
plt.show()
# calculating multi-class AUC: one-vs-rest, weighted by class support
y_pred_group_prob_train = svm_model_group.predict_proba(X_train_vect)
auc_group_train = roc_auc_score(y_train_group, y_pred_group_prob_train, multi_class='ovr', average='weighted')
print(f'AUC for group classification: {auc_group_train}')
AUC for group classification: 0.9999974580939129
# plotting ROC curve for group classification (one-vs-rest for multi-class);
# enumerate order matches the probability columns because np.unique sorts
# class names the same way sklearn orders classes_.
plt.figure()
for i, group in enumerate(np.unique(y_train_group)):
    fpr, tpr, _ = roc_curve(y_train_group == group, y_pred_group_prob_train[:, i])
    plt.plot(fpr, tpr, label=f'Group {group} AUC = {roc_auc_score(y_train_group == group, y_pred_group_prob_train[:, i]):.2f}')
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for Group Classification')
plt.legend(loc='best')
plt.show()
# Evaluate the Model for discriminatory group on the TEST set.
# Fixes: the original comment said "train set", and the result was stored
# into accuracy_group_train — clobbering the train-set metric computed
# earlier. Use a test-specific name instead.
y_pred_group = svm_model_group.predict(X_test_vect)
accuracy_group_test = accuracy_score(y_test_group, y_pred_group)
print(f'Accuracy: {accuracy_group_test}')
Accuracy: 0.9991860654403386
# Print F1 scores for each group on the test set (per-class scores,
# ordered by sorted group name to match sklearn's class ordering).
f1_group = f1_score(y_test_group, y_pred_group, average=None)
for group, f1 in zip(np.unique(y_test_group), f1_group):
    print(f'F1 score for group {group}: {f1}')
F1 score for group asian: 1.0 F1 score for group black: 0.9976133651551313 F1 score for group chinese: 1.0 F1 score for group jewish: 0.9965753424657535 F1 score for group latino: 0.9894736842105264 F1 score for group lgbtq: 0.9985007496251874 F1 score for group mental_dis: 1.0 F1 score for group mexican: 0.9959514170040485 F1 score for group middle_east: 1.0 F1 score for group muslim: 1.0 F1 score for group native_american: 1.0 F1 score for group physical_dis: 1.0 F1 score for group women: 1.0
# plotting confusion matrix for group classification (test set)
cm_group = confusion_matrix(y_test_group, y_pred_group)
fig, ax = plt.subplots(figsize=(22, 20))
disp_group = ConfusionMatrixDisplay(confusion_matrix=cm_group, display_labels=svm_model_group.classes_)
disp_group.plot(cmap=plt.cm.Blues, ax=ax)
plt.title('Confusion Matrix for Group Classification')
plt.show()
# calculating multi-class AUC: one-vs-rest, weighted by class support
y_pred_group_prob = svm_model_group.predict_proba(X_test_vect)
auc_group = roc_auc_score(y_test_group, y_pred_group_prob, multi_class='ovr', average='weighted')
print(f'AUC for group classification: {auc_group}')
AUC for group classification: 0.9999845754505324
# plotting ROC curve for group classification on the test set
# (one-vs-rest per class; column i of the probabilities corresponds to
# the i-th sorted class name).
plt.figure()
for i, group in enumerate(np.unique(y_test_group)):
    fpr, tpr, _ = roc_curve(y_test_group == group, y_pred_group_prob[:, i])
    plt.plot(fpr, tpr, label=f'Group {group} AUC = {roc_auc_score(y_test_group == group, y_pred_group_prob[:, i]):.2f}')
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for Group Classification')
plt.legend(loc='best')
plt.show()
# Implementing LIME for group classification; class names come straight
# from the fitted model so they align with predict_proba's columns.
explainer_group = LimeTextExplainer(class_names=svm_model_group.classes_)
# Function to predict probabilities for LIME
def predict_proba_for_lime_group(texts):
    """Preprocess raw texts and return the group-SVM class probabilities.

    Bug fix (same as predict_proba_for_lime): preprocess_text_lime
    returns a STRING, which get_sentence_vector would iterate character
    by character — split it back into tokens before vectorising.
    """
    cleaned = [preprocess_text_lime(text) for text in texts]
    vectors = np.array([get_sentence_vector(text.split(), model) for text in cleaned])
    return svm_model_group.predict_proba(vectors)
# Explain the same instance (idx was set to 0 in the label-LIME cell)
# for the group classifier, showing the 10 most influential words.
exp_group = explainer_group.explain_instance(df['prompt'][idx], predict_proba_for_lime_group, num_features=10)
exp_group.show_in_notebook(text=True)